import re
import json

# Locations of the raw input dump and of the JSON Lines file to produce.
input_path = '0721/pre-train-texts.json'
output_path = '0721/pre-train-texts.jsonl'

# Load the entire input into memory as a single UTF-8 decoded string.
with open(input_path, encoding='utf-8') as source_file:
    content = source_file.read()

# Repair {"text": "xxx": "yyy"} into {"text": "xxx：yyy"}.
# NOTE: this assumes that only the "text" field is affected.

# One JSON string literal. Backslash escapes (including \") are consumed as a
# unit so an escaped quote never terminates the literal early.
_JSON_STRING = r'"(?:[^"\\]|\\.)*"'

# A whole object whose "text" value is two or more string literals chained
# together by colons. Group 1 is the complete chain.
_BROKEN_TEXT_RE = re.compile(
    r'\s*\{\s*"text"\s*:\s*('
    + _JSON_STRING
    + r'(?:\s*:\s*' + _JSON_STRING + r')+'
    + r')\s*\}\s*',
    re.DOTALL,
)

# Captures the raw (still escaped) contents of each literal in the chain.
_SEGMENT_RE = re.compile(r'"((?:[^"\\]|\\.)*)"', re.DOTALL)


def fix_json_obj(obj_str):
    """Merge a colon-split "text" value back into a single JSON string.

    Recognises objects of the form ``{"text": "a": "b"}`` (optionally with
    more chained segments and surrounding whitespace) and rewrites them as
    ``{"text": "a：b"}``, joining the segments with a full-width colon.

    The segments are copied verbatim, so escape sequences they contain
    (``\\"``, ``\\n``, ...) remain valid inside the merged string.

    Args:
        obj_str: Source text of one candidate JSON object.

    Returns:
        The repaired object text when ``obj_str`` matches the broken shape,
        otherwise ``obj_str`` unchanged.
    """
    match = _BROKEN_TEXT_RE.fullmatch(obj_str)
    if match is None:
        return obj_str

    # Scanning left to right always starts on an opening quote, so every hit
    # is exactly one literal of the chain.
    segments = _SEGMENT_RE.findall(match.group(1))
    return '{"text": "' + '：'.join(segments) + '"}'

# Fallback extractor: the shortest "{...}" span starting at a given offset.
_LAZY_OBJECT_RE = re.compile(r'\{.*?\}', re.DOTALL)


def _extract_object_strings(text):
    """Return the source text of every object candidate found in ``text``.

    At each ``{`` a real JSON parse is attempted first, so well-formed
    objects are captured whole even when a string value contains ``}`` or
    when objects are nested. Only if that parse fails does the scan fall
    back to the shortest ``{...}`` span, which is what lets malformed
    records reach ``fix_json_obj`` for repair.

    Args:
        text: Full contents of the input file.

    Returns:
        A list of substrings of ``text``, in order of appearance and
        non-overlapping, each starting with ``{``.
    """
    decoder = json.JSONDecoder()
    found = []
    start = text.find('{')
    while start != -1:
        try:
            _, end = decoder.raw_decode(text, start)
        except ValueError:
            lazy = _LAZY_OBJECT_RE.match(text, start)
            if lazy is None:
                # No closing brace exists after this point, so no further
                # candidate can be formed either.
                break
            end = lazy.end()
        found.append(text[start:end])
        start = text.find('{', end)
    return found


objs = _extract_object_strings(content)

with open(output_path, 'w', encoding='utf-8') as f:
    for obj_str in objs:
        fixed_obj_str = fix_json_obj(obj_str)
        try:
            obj = json.loads(fixed_obj_str)
            f.write(json.dumps(obj, ensure_ascii=False) + '\n')
        except ValueError as e:
            # ValueError covers json.JSONDecodeError (unparseable record) and
            # UnicodeEncodeError (text that cannot be written as UTF-8).
            # Real I/O failures are deliberately left to propagate instead of
            # being reported as a parse failure.
            print('解析失败:', fixed_obj_str)
            print(e)
